{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current huggingface cache dir: /data1/mingjia/cache/huggingface\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/mingjia/.conda/envs/llm/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "######## HF CACHE (LOAD BEFORE HF PACKAGES) ########\n",
    "# Per the banner above, HF_HOME is set before any Hugging Face package is imported.\n",
    "# setdefault() respects a cache directory already exported in the environment and\n",
    "# only falls back to the original hard-coded location when none is configured.\n",
    "os.environ.setdefault('HF_HOME', \"/data1/mingjia/cache/huggingface\")\n",
    "print(f\"Current huggingface cache dir: {os.environ['HF_HOME']}\")\n",
    "\n",
    "import math\n",
    "import random\n",
    "import numpy as np\n",
    "import torch\n",
    "from transformers import AutoModel, OPTForCausalLM, AutoTokenizer, LogitsProcessorList\n",
    "from transformers import LlamaForCausalLM, LlamaTokenizer\n",
    "from torch.cuda.amp import autocast\n",
    "from watermark_processor import WatermarkLogitsProcessor, WatermarkDetector\n",
    "\n",
    "# A single fixed seed is applied to every RNG used in this notebook so that the\n",
    "# sampled generations can be repeated.\n",
    "hash_key = 15485863\n",
    "random.seed(hash_key)\n",
    "np.random.seed(hash_key)\n",
    "torch.manual_seed(hash_key)\n",
    "# Seed every visible GPU rather than only the current one: the notebook later\n",
    "# selects a CUDA device by index, which need not be the default device.\n",
    "# The call is a no-op when CUDA is unavailable.\n",
    "torch.cuda.manual_seed_all(hash_key)\n",
    "# Ask cuDNN for deterministic kernels and disable its auto-tuner.\n",
    "torch.backends.cudnn.deterministic = True\n",
    "torch.backends.cudnn.benchmark = False\n",
    "term_width = 80"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "ckpt_path = \"ckpt/opt/init_0.25_1.75_default.pth\"\n",
    "prompt = \"The Patently-O blog this week took a look at how the number of pending patent cases is changing. With the number of patent cases filed falling \\u2013 Lex Machina this week noted a 33% drop in October compared to October 2013, for example \\u2013 Patently-O dug into the pending cases to give another perspective on the state of patent litigation.\\nIts findings are consistent with the conclusion that the number of patent litigations is declining, although it noted that the number of pending cases is still well above that in October 2010.\\nThe Patently-O folks also had a look at how the USPTO is implementing Alice v CLS Bank in patent examination, in an article in the 2014 Patently-O Patent Law Journal. It ranked the top applicants with post-Alice allowance withdrawals. Unsurprisingly IBM, eBay and Microsoft were the top three assignees, named in 47, 19 and 15 applications respectively. Perhaps more surprisingly, gaming companies IGT and WMS Gaming are in the top 10. Also appearing are finance companies JP Morgan, American Express and Bank of America, which reflects the impact of Alice on financial services patents.\\nThe Supreme Court has declined to hear the appeal of a dispute over the copyright on Sherlock Holmes. On November 2 it left intact a ruling that 50 works by Sir Arthur Conan Doyle featuring the famous detective are in the public domain.\\nDoyle\\u2019s estate had been trying to get writer Leslie Klinger to pay a licence for using the Sherlock Holmes character in a compendium of new stories. A fee was paid for the first The New Annotated Sherlock Holmes but Klinger refused to pay a licence fee for the second installment.\\nIn June, the Seventh Circuit ruled that the copyright on 46 stories and four novels featuring Holmes were in the public domain. 
In August, Judge Richard Posner ordered Doyle\\u2019s estate to pay legal fees to Klinger and branded its demands \\u201ca form of extortion\\u201d.\\nThe past week threw up a bizarre copyright question: can you copyright a backside? As reported in a post on The IPKat blog, Kim Kardashian believes fitness model Jen Selter is copying her poses when posting pictures of her backside on Instagram. Kardashian has reportedly asked her lawyers to investigate whether there is a copyright infringement.\\nThe IPKat blog gave a detailed analysis of the issues involved, from an EU perspective at least. It quickly establishes that you cannot copyright a body part because it is neither a work nor the author\\u2019s own intellectual creation.The issue gets less clear when breast or bottom implants are involved, which could be seen as\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "##### LOAD MODEL #####\n",
    "\n",
    "# \"cuda:1\" only exists on hosts with at least two GPUs. Keep it when it is\n",
    "# present, otherwise fall back to the first GPU, and to the CPU without CUDA.\n",
    "if torch.cuda.is_available():\n",
    "    device = \"cuda:1\" if torch.cuda.device_count() > 1 else \"cuda:0\"\n",
    "else:\n",
    "    device = \"cpu\"\n",
    "d_type = torch.float16\n",
    "\n",
    "# Generation model and its tokenizer (left padding, as configured originally).\n",
    "model = OPTForCausalLM.from_pretrained(\"facebook/opt-1.3b\", torch_dtype=d_type).to(device)\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"facebook/opt-1.3b\", padding_side=\"left\")\n",
    "model.eval()\n",
    "# Sentence-embedding model used for the SimCSE similarity further down.\n",
    "model_simcse = AutoModel.from_pretrained(\"princeton-nlp/sup-simcse-roberta-base\", torch_dtype=d_type).to(device)\n",
    "model_simcse.eval()\n",
    "# Larger OPT model used to score perplexity further down.\n",
    "model_ppl = OPTForCausalLM.from_pretrained(\"facebook/opt-2.7b\", torch_dtype=d_type).to(device)\n",
    "model_ppl.eval()\n",
    "\n",
    "# Input-embedding table of the generation model, handed to the watermark\n",
    "# processor and detector. It is taken from the model that is already loaded\n",
    "# instead of loading facebook/opt-1.3b a second time; detach().clone() keeps it\n",
    "# an independent tensor, as the separately loaded copy was.\n",
    "embed_matrix = model.get_input_embeddings().weight.detach().clone()\n",
    "\n",
    "# Inference only: switch off gradient tracking for every parameter.\n",
    "for frozen_model in (model, model_simcse, model_ppl):\n",
    "    frozen_model.requires_grad_(False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "##### GENERATION #####\n",
    "\n",
    "watermark_processor = WatermarkLogitsProcessor(vocab=list(tokenizer.get_vocab().values()),\n",
    "                                            ckpt_path=ckpt_path, \n",
    "                                            embed_matrix = embed_matrix)\n",
    "\n",
    "# Tokenize once and reuse the attention mask produced by the tokenizer itself.\n",
    "# Rebuilding it as `input_ids != tokenizer.pad_token_id` fails for tokenizers\n",
    "# that define no pad token (pad_token_id is None).\n",
    "encoded_prompt = tokenizer(prompt, return_tensors=\"pt\").to(device)\n",
    "input_ids = encoded_prompt[\"input_ids\"]\n",
    "attention_masks = encoded_prompt[\"attention_mask\"]\n",
    "# Number of prompt tokens; used below to slice the continuation off the output.\n",
    "prefix_len = input_ids.shape[1]\n",
    "\n",
    "# Sampling settings shared by both runs. min_new_tokens == max_new_tokens, so\n",
    "# both bounds on the continuation length are 200 tokens.\n",
    "sample = dict(\n",
    "    do_sample=True,\n",
    "    top_k=50,\n",
    "    top_p=1.0,\n",
    "    temperature=1.0,\n",
    "    attention_mask=attention_masks,\n",
    "    min_new_tokens=200,\n",
    "    max_new_tokens=200,\n",
    ")\n",
    "\n",
    "with torch.no_grad():\n",
    "    with autocast():\n",
    "        # The RNG is re-seeded before each call so that the plain and the\n",
    "        # watermarked run start from the same random state.\n",
    "        torch.manual_seed(hash_key)\n",
    "        output_no_wm = model.generate(input_ids,\n",
    "                                **sample)\n",
    "\n",
    "        torch.manual_seed(hash_key)\n",
    "        output_w_wm = model.generate(input_ids,\n",
    "                                **sample,\n",
    "                                logits_processor=LogitsProcessorList([watermark_processor]))\n",
    "\n",
    "# Decode only the tokens that follow the prompt.\n",
    "decoded_output_no_wm = tokenizer.batch_decode(output_no_wm[:,prefix_len:], skip_special_tokens=True)\n",
    "decoded_output_w_wm = tokenizer.batch_decode(output_w_wm[:,prefix_len:], skip_special_tokens=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "([' a work even though breasts and bottoms make up less than 5% of the body. Also, Selter is merely a photographer’s model who does not write or create new ideas. She merely serves as a photo model. If Selter is the author, or if Kardashian is an author for her famous backside, then the court would have to decide who is a more successful creator.\\nThe IPKat said that we might be heading for a legal tug-of-war. If Selter is the author and Kardashian is a famous photographer, then the issue will have to be decided by a court as to who is in a more positive light. The beauty influencer might think Selter’s models are taking a picture of Selter for a good cause where the photographer is not trying to copy the photos, but Selter would probably not think that’s the case.\\nAlso in IPKat news, The IPKat Blog caught up with an IPKat and her'],\n",
       " [' a work even though breasts are not actual body parts.\\nA few commentators have concluded that the situation is analogous to someone taking photos from the backside of another person’s face, or of another person’s hands or arms or other body part. It is also possible that the copyright claim exists as a secondary claim, so what applies is whether Selter is using the photographer’s photograph in order to obtain a creative impression of her own, in order to re-create the pose of which the photographer was inspired by a previously seen work of the photographer.\\nIf Kim is looking for a copyright, Selter is out of luck – an article on USIP ICT Law Blog points out that the backside, while an important part of the body, is not an original work. It may therefore be possible for Selter to argue that Kardashian’s representation of a backside, or any picture involving photographic re-creation of the photographer’s image and a'])"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "decoded_output_no_wm, decoded_output_w_wm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "11.183034896850586"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "##### DETECTION #####\n",
    "\n",
    "# Detector settings collected up front; they mirror the vocab, checkpoint and\n",
    "# embedding table that were given to the watermark processor.\n",
    "detector_kwargs = dict(\n",
    "    vocab=list(tokenizer.get_vocab().values()),\n",
    "    device=device,\n",
    "    tokenizer_opt=tokenizer,\n",
    "    tokenizer_llama=None,\n",
    "    embed_matrix=embed_matrix,\n",
    "    ckpt_path=ckpt_path,\n",
    "    normalizers=\"\",\n",
    "    ignore_repeated_bigrams=False,\n",
    ")\n",
    "\n",
    "with torch.no_grad():\n",
    "    watermark_detector = WatermarkDetector(**detector_kwargs)\n",
    "    # Score the first decoded watermarked continuation.\n",
    "    watermarked_text = decoded_output_w_wm[0]\n",
    "    with autocast():\n",
    "        score_dict = watermark_detector.detect(watermarked_text)\n",
    "    # Pull the z-score out of the result as a plain Python number.\n",
    "    z_score = score_dict['z_score'].item()\n",
    "\n",
    "z_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(12.692555303381242, 0.8056640625)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "##### PPL, SIMCSE #####\n",
    "\n",
    "cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6) # function for SimCSE\n",
    "\n",
    "with torch.no_grad():\n",
    "    # PPL of the watermarked sequence under model_ppl.\n",
    "    tokd_labels = output_w_wm.clone().detach()\n",
    "    attention_masks = (output_w_wm != tokenizer.pad_token_id).long()\n",
    "    # Label -100 is the ignore index of the loss, so the masked positions do not\n",
    "    # contribute to the perplexity.\n",
    "    # NOTE(review): the \"+1\" masks the first generated token as well as the\n",
    "    # prompt - confirm this is intended before changing it.\n",
    "    tokd_labels[:,:prefix_len+1] = -100\n",
    "    log_ppl = model_ppl(output_w_wm, attention_mask=attention_masks, labels=tokd_labels).loss.item()\n",
    "\n",
    "    # SimCSE: use the last 5 tokens of prompt and the generated text to compute.\n",
    "    # The start index is clamped at 0: with a prompt shorter than 5 tokens,\n",
    "    # `prefix_len-5` would be negative and the slice would silently keep only\n",
    "    # the tail of the sequence.\n",
    "    simcse_start = max(prefix_len - 5, 0)\n",
    "    simcse_ids_wm = output_w_wm[:, simcse_start:]\n",
    "    simcse_ids_no_wm = output_no_wm[:, simcse_start:]\n",
    "    attention_masks = torch.ones_like(simcse_ids_wm) # should all be 1, since we set the new_token = 200\n",
    "    embed_wm = model_simcse(simcse_ids_wm, attention_mask=attention_masks, output_hidden_states=True, return_dict=True).pooler_output\n",
    "    embed_no_wm = model_simcse(simcse_ids_no_wm, attention_mask=attention_masks, output_hidden_states=True, return_dict=True).pooler_output\n",
    "    # Cosine similarity between the pooled embeddings of the two continuations.\n",
    "    simcse = cos(embed_wm[0], embed_no_wm[0]).item()\n",
    "\n",
    "math.exp(log_ppl), simcse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "proteinchat",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
